{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#保存数据\n",
    "import cPickle\n",
    "\n",
    "import numpy as np\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 将所有特征串联起来，构成FE_Train.csv\n",
    "#FE_Test.csv\n",
    "#为最后推荐系统做准备\n",
    "from __future__ import division\n",
    "\n",
    "import cPickle\n",
    "import numpy as np\n",
    "import scipy.io as sio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "生成训练数据...\n",
      "\n",
      "train.csv:500 (userId, eventId)=(123290209, 1887085024)\n",
      "train.csv:1000 (userId, eventId)=(272886293, 199858305)\n",
      "train.csv:1500 (userId, eventId)=(395305791, 1582270949)\n",
      "train.csv:2000 (userId, eventId)=(527523423, 3272728211)\n",
      "train.csv:2500 (userId, eventId)=(651258472, 792632006)\n",
      "train.csv:3000 (userId, eventId)=(811791433, 524756826)\n",
      "train.csv:3500 (userId, eventId)=(985547042, 1269035551)\n",
      "train.csv:4000 (userId, eventId)=(1107615001, 173949238)\n",
      "train.csv:4500 (userId, eventId)=(1236336671, 3849306291)\n",
      "train.csv:5000 (userId, eventId)=(1414301782, 2652356640)\n",
      "train.csv:5500 (userId, eventId)=(1595465532, 955398943)\n",
      "train.csv:6000 (userId, eventId)=(1747091728, 2131379889)\n",
      "train.csv:6500 (userId, eventId)=(1914182220, 955398943)\n",
      "train.csv:7000 (userId, eventId)=(2071842684, 1076364848)\n",
      "train.csv:7500 (userId, eventId)=(2217853337, 3051438735)\n",
      "train.csv:8000 (userId, eventId)=(2338481531, 2525447278)\n",
      "train.csv:8500 (userId, eventId)=(2489551967, 520657921)\n",
      "train.csv:9000 (userId, eventId)=(2650493630, 87962584)\n",
      "train.csv:9500 (userId, eventId)=(2791418962, 4223848259)\n",
      "train.csv:10000 (userId, eventId)=(2903662804, 2791462807)\n",
      "train.csv:10500 (userId, eventId)=(3036141956, 3929507420)\n",
      "train.csv:11000 (userId, eventId)=(3176074542, 3459485614)\n",
      "train.csv:11500 (userId, eventId)=(3285425249, 2271782630)\n",
      "train.csv:12000 (userId, eventId)=(3410667855, 1063772489)\n",
      "train.csv:12500 (userId, eventId)=(3531604778, 2584839423)\n",
      "train.csv:13000 (userId, eventId)=(3686871863, 53495098)\n",
      "train.csv:13500 (userId, eventId)=(3833637800, 2415873572)\n",
      "train.csv:14000 (userId, eventId)=(3944021305, 2096772901)\n",
      "train.csv:14500 (userId, eventId)=(4075466480, 3567240505)\n",
      "train.csv:15000 (userId, eventId)=(4197193550, 1628057176)\n",
      "生成预测数据...\n",
      "\n",
      "test.csv:500 (userId, eventId)=(182290053, 2529072432)\n",
      "test.csv:1000 (userId, eventId)=(433510318, 4244463632)\n",
      "test.csv:1500 (userId, eventId)=(632808865, 2845303452)\n",
      "test.csv:2000 (userId, eventId)=(813611885, 2036538169)\n",
      "test.csv:2500 (userId, eventId)=(1010701404, 303459881)\n",
      "test.csv:3000 (userId, eventId)=(1210932037, 2529072432)\n",
      "test.csv:3500 (userId, eventId)=(1452921099, 2705317682)\n",
      "test.csv:4000 (userId, eventId)=(1623287180, 1626678328)\n",
      "test.csv:4500 (userId, eventId)=(1855201342, 2603032829)\n",
      "test.csv:5000 (userId, eventId)=(2083900381, 2529072432)\n",
      "test.csv:5500 (userId, eventId)=(2318415276, 2509151803)\n",
      "test.csv:6000 (userId, eventId)=(2528161539, 4025975316)\n",
      "test.csv:6500 (userId, eventId)=(2749110768, 4244406355)\n",
      "test.csv:7000 (userId, eventId)=(2927772127, 1532377761)\n",
      "test.csv:7500 (userId, eventId)=(3199685636, 1776393554)\n",
      "test.csv:8000 (userId, eventId)=(3393388475, 680270887)\n",
      "test.csv:8500 (userId, eventId)=(3601169721, 154434302)\n",
      "test.csv:9000 (userId, eventId)=(3828963415, 3067222491)\n",
      "test.csv:9500 (userId, eventId)=(4018723397, 2522610844)\n",
      "test.csv:10000 (userId, eventId)=(4180064266, 2658555390)\n"
     ]
    }
   ],
   "source": [
    "class DataRewriter:\n",
    "    \"\"\"Turn the precomputed recommender files into per-row classifier features.\n",
    "\n",
    "    For every (user, event) row of train.csv / test.csv this class computes\n",
    "    user-based and item-based collaborative-filtering scores plus popularity\n",
    "    and friend-influence features, and writes them to data_train.csv /\n",
    "    data_test.csv.\n",
    "    \"\"\"\n",
    "\n",
    "    def _load_pickle(self, path):\n",
    "        \"\"\"Unpickle `path` and close the file handle afterwards.\"\"\"\n",
    "        # NOTE(review): unpickling can execute arbitrary code; only load files\n",
    "        # that were produced by trusted preprocessing steps.\n",
    "        with open(path, 'rb') as fh:\n",
    "            return cPickle.load(fh)\n",
    "\n",
    "    def __init__(self):\n",
    "        \"\"\"Load all intermediate files from the current working directory.\"\"\"\n",
    "        # Raw id -> row/column position in the matrices loaded below.\n",
    "        self.userIndex = self._load_pickle('PE_userIndex.pkl')\n",
    "        self.eventIndex = self._load_pickle('PE_eventIndex.pkl')\n",
    "\n",
    "        # event id -> user ids, and user id -> event ids.\n",
    "        self.usersForEvent = self._load_pickle('PE_usersForEvent.pkl')\n",
    "        self.eventsForUser = self._load_pickle('PE_eventsForUser.pkl')\n",
    "\n",
    "        # NOTE(review): this is a second load of PE_usersForEvent.pkl and no\n",
    "        # method of this class reads self.event; kept so the attribute still\n",
    "        # exists for any external reader. Confirm whether another file was meant.\n",
    "        self.event = self._load_pickle('PE_usersForEvent.pkl')\n",
    "\n",
    "        # Indexed as [user row, event column].\n",
    "        self.userEventScores = sio.mmread('PE_userEventScores').todense()\n",
    "\n",
    "        self.userSimMatrix = sio.mmread('US_userSimMatrix').todense()\n",
    "\n",
    "        self.eventPropSim = sio.mmread('EV_eventPropSim').todense()\n",
    "        self.eventContSim = sio.mmread('EV_eventContSim').todense()\n",
    "\n",
    "        # Kept exactly as mmread returns it (no todense); userPop indexes it\n",
    "        # as [0, i].\n",
    "        self.numFriends = sio.mmread('UF_numFriends')\n",
    "        self.userFriends = sio.mmread('UF_userFriends').todense()\n",
    "\n",
    "        self.eventPopularity = sio.mmread('EA_eventPopularity').todense()\n",
    "\n",
    "        # Latent-factor placeholders: both matrices stay all-zero until a\n",
    "        # model (e.g. SVD++/LFM) fills them in.\n",
    "        self.k = 10\n",
    "        self.userLFMMatrix = ss.dok_matrix((self.userEventScores.shape[0], self.k))\n",
    "        self.eventLFMMatrix = ss.dok_matrix((self.userEventScores.shape[1], self.k))\n",
    "\n",
    "    def userReco(self, userId, eventId):\n",
    "        \"\"\"Score eventId for userId with user-based collaborative filtering.\n",
    "\n",
    "        Every other user recorded for the event contributes their score for\n",
    "        it, weighted by their similarity to userId.\n",
    "\n",
    "        Returns the similarity-weighted average score, or 0 when no user is\n",
    "        recorded for the event or the similarities sum to zero.\n",
    "        \"\"\"\n",
    "        # .get() so an event without recorded users yields None instead of a\n",
    "        # KeyError, which is what the check below expects.\n",
    "        user_ids = self.usersForEvent.get(eventId)\n",
    "        if user_ids is None or len(user_ids) == 0:\n",
    "            return 0\n",
    "\n",
    "        sum_score = 0\n",
    "        sum_sim = 0\n",
    "        for uid in user_ids:\n",
    "            if uid == userId:\n",
    "                continue\n",
    "            other_row = self.userIndex[uid]\n",
    "            sim = self.userSimMatrix[other_row, self.userIndex[userId]]\n",
    "            u_score = self.userEventScores[other_row, self.eventIndex[eventId]]\n",
    "            sum_score += u_score * sim\n",
    "            sum_sim += sim\n",
    "\n",
    "        if sum_sim == 0:\n",
    "            return 0\n",
    "        return float(sum_score) / float(sum_sim)\n",
    "\n",
    "    def eventReco(self, userId, eventId):\n",
    "        \"\"\"Score eventId for userId with item-based collaborative filtering.\n",
    "\n",
    "        Every other event recorded for the user contributes the user's score\n",
    "        for it, weighted once by eventPropSim and once by eventContSim.\n",
    "\n",
    "        Returns a pair (pscore, cscore) of weighted averages; a component is 0\n",
    "        when its similarities sum to zero, and (0, 0) is returned when no\n",
    "        event is recorded for the user.\n",
    "        \"\"\"\n",
    "        event_ids = self.eventsForUser.get(userId)\n",
    "        if event_ids is None or len(event_ids) == 0:\n",
    "            return 0, 0\n",
    "\n",
    "        sum_p_score = 0\n",
    "        sum_c_score = 0\n",
    "        sum_p_sim = 0\n",
    "        sum_c_sim = 0\n",
    "        for eid in event_ids:\n",
    "            if eid == eventId:\n",
    "                continue\n",
    "            other_col = self.eventIndex[eid]\n",
    "            target_col = self.eventIndex[eventId]\n",
    "            p_sim = self.eventPropSim[other_col, target_col]\n",
    "            c_sim = self.eventContSim[other_col, target_col]\n",
    "            e_score = self.userEventScores[self.userIndex[userId], other_col]\n",
    "\n",
    "            sum_p_score += e_score * p_sim\n",
    "            sum_c_score += e_score * c_sim\n",
    "            sum_p_sim += p_sim\n",
    "            sum_c_sim += c_sim\n",
    "\n",
    "        pscore = 0 if float(sum_p_sim) == 0 else float(sum_p_score) / float(sum_p_sim)\n",
    "        cscore = 0 if float(sum_c_sim) == 0 else float(sum_c_score) / float(sum_c_sim)\n",
    "        return pscore, cscore\n",
    "\n",
    "    def ModelReco(self, userId, eventId):\n",
    "        \"\"\"Return the dot product of the user's and the event's latent vectors.\n",
    "\n",
    "        The latent matrices are created all-zero in __init__, so this yields 0\n",
    "        until they are populated by a trained model.\n",
    "        \"\"\"\n",
    "        user_vec = self.userLFMMatrix[self.userIndex[userId], :]\n",
    "        event_vec = self.eventLFMMatrix[self.eventIndex[eventId], :]\n",
    "        # Both slices are (1, k) rows: transpose the second so the product is\n",
    "        # (1, 1), then unwrap it to a scalar.\n",
    "        return user_vec.dot(event_vec.T)[0, 0]\n",
    "\n",
    "    def userPop(self, userId):\n",
    "        \"\"\"Return the numFriends entry for userId as a sociability feature.\n",
    "\n",
    "        Returns 0 for a user missing from userIndex or outside numFriends.\n",
    "        \"\"\"\n",
    "        if userId not in self.userIndex:\n",
    "            return 0\n",
    "        try:\n",
    "            return self.numFriends[0, self.userIndex[userId]]\n",
    "        except IndexError:\n",
    "            return 0\n",
    "\n",
    "    def friendInfluence(self, userId):\n",
    "        \"\"\"Return the user's userFriends row total divided by the column count.\n",
    "\n",
    "        The whole row is summed; summing a single row along axis 0 and then\n",
    "        taking element [0, 0] would only ever look at the first column.\n",
    "        \"\"\"\n",
    "        n_users = np.shape(self.userFriends)[1]\n",
    "        row = self.userIndex[userId]\n",
    "        # float() keeps this a true division even without __future__.division.\n",
    "        return self.userFriends[row, :].sum() / float(n_users)\n",
    "\n",
    "    def eventPop(self, eventId):\n",
    "        \"\"\"Return the eventPopularity entry (column 0) for eventId.\"\"\"\n",
    "        return self.eventPopularity[self.eventIndex[eventId], 0]\n",
    "\n",
    "    def rewriteData(self, start=1, train=True, header=True):\n",
    "        \"\"\"Write one feature row per input row of train.csv or test.csv.\n",
    "\n",
    "        start  -- 1-based number of the first input line to process; earlier\n",
    "                  lines are skipped (use 2 to skip a header line).\n",
    "        train  -- read train.csv and append the two label columns when True,\n",
    "                  otherwise read test.csv.\n",
    "        header -- write a header line to the output file when True.\n",
    "\n",
    "        The output goes to 'data_' + the input file name.\n",
    "        \"\"\"\n",
    "        fn = 'train.csv' if train else 'test.csv'\n",
    "        # Context managers close both files even if a row raises.\n",
    "        with open(fn, 'rb') as fin, open('data_' + fn, 'wb') as fout:\n",
    "            if header:\n",
    "                ocolnames = ['invited', 'user_reco', 'evt_p_reco',\n",
    "                             'evt_c_reco', 'user_pop', 'frnd_infl', 'evt_pop']\n",
    "                if train:\n",
    "                    ocolnames.extend(['interested', 'not_interested'])\n",
    "                fout.write(','.join(ocolnames) + '\\n')\n",
    "\n",
    "            for ln, line in enumerate(fin, 1):\n",
    "                if ln < start:\n",
    "                    continue\n",
    "                stripped = line.strip()\n",
    "                if not stripped:\n",
    "                    # A blank line (e.g. trailing newline) carries no row.\n",
    "                    continue\n",
    "                cols = stripped.split(',')\n",
    "                userId = cols[0]\n",
    "                eventId = cols[1]\n",
    "                invited = cols[2]\n",
    "                if ln % 500 == 0:\n",
    "                    # Progress report every 500 input lines.\n",
    "                    print('%s:%d (userId, eventId)=(%s, %s)' % (fn, ln, userId, eventId))\n",
    "                evt_p_reco, evt_c_reco = self.eventReco(userId, eventId)\n",
    "                ocols = [invited, self.userReco(userId, eventId), evt_p_reco,\n",
    "                         evt_c_reco, self.userPop(userId),\n",
    "                         self.friendInfluence(userId), self.eventPop(eventId)]\n",
    "                if train:\n",
    "                    ocols.append(cols[4])  # interested\n",
    "                    ocols.append(cols[5])  # not_interested\n",
    "                fout.write(','.join(map(str, ocols)) + '\\n')\n",
    "\n",
    "    def rewriteTrainingSet(self):\n",
    "        \"\"\"Rewrite train.csv with rewriteData's default start and header.\"\"\"\n",
    "        # Keyword argument: the first positional parameter is `start`.\n",
    "        self.rewriteData(train=True)\n",
    "\n",
    "    def rewriteTestSet(self):\n",
    "        \"\"\"Rewrite test.csv with rewriteData's default start and header.\"\"\"\n",
    "        # Keyword argument: a positional False would bind to `start` and the\n",
    "        # training set would be rewritten instead.\n",
    "        self.rewriteData(train=False)\n",
    "\n",
    "# Build the feature files for both splits. start=2 makes rewriteData skip\n",
    "# the first input line (presumably the CSV header -- confirm against the data).\n",
    "dr = DataRewriter()\n",
    "for banner, is_train in ((\"生成训练数据...\\n\", True), (\"生成预测数据...\\n\", False)):\n",
    "    print(banner)\n",
    "    dr.rewriteData(start=2, train=is_train, header=True)"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
